Package org.terrier.indexing

Source Code of org.terrier.indexing.MSPowerpointDocument$PowerpointExtractor

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is MSPowerpointdocument.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*/
package org.terrier.indexing;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;
import org.terrier.indexing.tokenisation.Tokeniser;
/** Implements a Document object for reading Microsoft Powerpoint files.
*  This implementation uses the Jakarta-POI (POIFS) library, so to compile
*   or use this module, you must have the poi-?.?./-final-*.jar in your
*   classpath.
@author Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/
public class MSPowerpointDocument extends FileDocument
{
  protected static final Logger logger = Logger.getLogger(MSPowerpointDocument.class);
  /** Constructs a new MSPowerpointDocument object for the passed InputStream
   *  @param filename the file that has been opened
   *   @param docStream the stream of the file */
  public MSPowerpointDocument(String filename, InputStream docStream, Tokeniser tokeniser)
  {
    super(filename, docStream, tokeniser);
  }
  /**
   * Constructs a new MSPowerpointDocument object for the passed InputStream
   * @param docStream
   * @param docProperties
   * @param tok
   */
  public MSPowerpointDocument(InputStream docStream,
      Map<String, String> docProperties, Tokeniser tok) {
    super(docStream, docProperties, tok);
 
  }
  /**
   * Constructs a new MSPowerpointDocument object for the passed InputStream
   * @param docReader
   * @param docProperties
   * @param tok
   */
  public MSPowerpointDocument(Reader docReader,
      Map<String, String> docProperties, Tokeniser tok) {
    super(docReader, docProperties, tok);
  }
  /**
   * Constructs a new MSPowerpointDocument object for the passed InputStream
   * @param filename
   * @param docReader
   * @param tok
   */
  public MSPowerpointDocument(String filename, Reader docReader, Tokeniser tok) {
    super(filename, docReader, tok);
  }

  /** This class implements a POIFSReaderListener, which means is can
   *   can attach to events from the POI libraries, and write the associated
   *  data to a writer object */
  static class PowerpointExtractor implements POIFSReaderListener
  {
    ByteArrayOutputStream writer = new ByteArrayOutputStream();
    public void processPOIFSReaderEvent(POIFSReaderEvent event)
    {
      try{
        if(!event.getName().equalsIgnoreCase("PowerPoint Document"))
          return;
        DocumentInputStream input = event.getStream();
        byte[] buffer = new byte[input.available()];
        input.read(buffer, 0, input.available());
        for(int i=0; i<buffer.length-20; i++)
        {
          long type = LittleEndian.getUShort(buffer,i+2);
          long size = LittleEndian.getUInt(buffer,i+4);
          if(type==4008)
          {
            writer.write(buffer, i + 4 + 1, (int) size +3);
            i = i + 4 + 1 + (int) size - 1;
          }
        }
      }
      catch (Exception e)
      {
        logger.fatal(e.getMessage(),e);
      }
    }
  }
 
  /** This method returns the Reader for the @param docStream
   *   file stream. This involves loading and converting the powerpoint
   *  document.
   *  On failure, returns null, and sets EOD to true, so no terms can
   *  be read from this object.
   */
  protected Reader getReader(InputStream docStream)
  {
    try{
      POIFSReader reader = new POIFSReader();
      PowerpointExtractor ppe = new PowerpointExtractor();
      reader.registerListener(ppe);
      reader.read(docStream);
      return new StringReader(ppe.writer.toString());     
    } catch (Exception e) {
      //logger.warn("WARNING: Problem convering MS Powerpoint document",e);
      EOD = true;
      return null;
    }
  }
}
TOP

Related Classes of org.terrier.indexing.MSPowerpointDocument$PowerpointExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.